import functools
import re

import pandas as pd
import spacy_stanza
import stanza
import swifter
import tqdm as tq

# 1. Load Preprocessed Data
# `result` is the working DataFrame for the whole script: later steps read its
# 'Content' column and attach derived columns to it.
result = pd.read_feather("FoxMSNBC_cleandata_2.feather")

# 2. Load NLP Model (Stanza + spaCy)
# English Stanza pipeline exposed through the spaCy API, restricted to the
# tokenize/pos/ner/lemma processors, with GPU execution requested.
nlp = spacy_stanza.load_pipeline('en', processors='tokenize,pos,ner,lemma', use_gpu=True)
# Collapse each multi-token named entity into a single token, so an entity
# shows up as one row (with one entity type) in the tagged output.
nlp.add_pipe("merge_entities")

# 3. Define Text Cleaning and Lemmatization Functions

def cleaning(text):
    """Normalise raw text before it is handed to the NLP pipeline.

    Steps, in order:

    1. Newlines and the listed contraction/possessive suffixes (written with
       a backtick, straight or curly apostrophe) are replaced by a space.
    2. Backslashes, underscores and hyphens are replaced by a space.
    3. Any whitespace-delimited chunk containing ``@`` (plus one trailing
       whitespace character, if present) is replaced by a space.
    4. Every run of whitespace is collapsed to a single space.

    Leading/trailing whitespace is collapsed but not stripped.
    """
    suffix_or_newline = r"(\n|`s|'s|’s|`m|'m|’m|`ve|'ve|’ve|`re|'re|’re|`am|'am|’am|`t|'t|’t|`d|'d|’d|n`t|n't|n’t|`ll|'ll|’ll)"
    cleaned = re.sub(suffix_or_newline, " ", text)

    # Punctuation that glues words together is treated as a word separator.
    for separator in ('\\', '_', '-'):
        cleaned = cleaned.replace(separator, ' ')

    cleaned = re.sub(r'\S*@\S*\s?', ' ', cleaned)
    return re.sub(r'\s+', ' ', cleaned)


def lemma_tag(text):
    """Run the NLP pipeline over *text* and describe every token.

    Returns one ``[lemma, tag, surface, position]`` list per token, where

    * ``lemma`` is the lower-cased lemma,
    * ``tag`` is the token's entity type when it has one, otherwise its
      ``tag_`` value,
    * ``surface`` is the token text as it appears in *text*,
    * ``position`` is the token's index in the document, as a string.
    """
    return [
        [tok.lemma_.lower(), tok.ent_type_ or tok.tag_, tok.text, str(tok.i)]
        for tok in nlp(text)
    ]

# 4. Apply Preprocessing
# Clean and tag every document in 'Content' one row at a time, showing a tqdm
# progress bar, and print the elapsed wall-clock time afterwards.
start_time = pd.Timestamp.now()
tq.tqdm.pandas(desc='Text Processing')  # makes `progress_apply` available on pandas objects
# Each cell of 'Content_tagged' is the list of [lemma, tag, text, index] rows
# returned by lemma_tag for the cleaned document.
result['Content_tagged'] = result['Content'].progress_apply(lambda x: lemma_tag(cleaning(x)))
print("Processing time:", pd.Timestamp.now() - start_time)

# 5. Add Original Text with Index
def original_text_index(text):
    """Label each token's surface form with its position.

    *text* is an iterable of 4-item rows; only the third item (surface text)
    and the fourth item (index) are used. Returns a list of
    ``"surface(index)"`` strings in the same order as the input rows.
    """
    labelled = []
    for row in text:
        _lemma, _tag, original, index = row
        labelled.append(f"{original}({index})")
    return labelled

# Keep a per-document list of "text(index)" strings built from the tagged rows,
# so original surface forms can still be looked up by token index.
result['Content_WithIndex'] = result['Content_tagged'].apply(original_text_index)

# 6. POS Filtering and Stopword Removal
# Tags whose tokens are dropped. A token's tag is its entity type when it has
# one, otherwise its part-of-speech tag (see lemma_tag), so this list mixes
# entity labels (DATE, MONEY, ...) with POS/punctuation tags.
exclude_tags = ["DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "AFX", "PDT", "PRP$", "WDT", "WP$", "IN", "EX", "WRB", "CC", "DT", "UH", "WP", "CD", "POS", "TO", "PRP", "-LRB-", "-RRB-", ",", ":", ".", "”", "“", "RB", "RBR", "RBS", "HYPH", "LS", "NFP", "#", "$", "SYM", "BES", "HVS", "MD", "ADD", "XX", "_SP"]
# Start from the pipeline's default stop words, add a few extra verbs, and
# take "same" back out so it is kept as a content word.
stopwords = list(nlp.Defaults.stop_words)
stopwords.extend(["let", "thanks", "thank", "want", "ask", "tell"])
stopwords.remove("same")

# Hash-based snapshots of the two lists above. tag_filter tests membership
# once per token, so O(1) lookups replace a linear scan of both lists.
# NOTE: these are taken once, here; later edits to `exclude_tags` or
# `stopwords` are not picked up by tag_filter.
_EXCLUDE_TAG_SET = frozenset(exclude_tags)
_STOPWORD_SET = frozenset(stopwords)


def tag_filter(rows):
    """Keep the content-bearing tokens of one tagged document.

    Args:
        rows: iterable of 4-item rows ``(word, tag, text, index)`` as
            produced by lemma_tag; the third item is ignored.

    Returns:
        A list of ``"word(tag,index)"`` strings for every row whose tag is
        not excluded, whose word is not a stop word, and whose word is
        between 2 and 49 characters long. Spaces inside ``word`` are replaced
        with underscores so each entry is a single space-free token.
    """
    return [
        f"{word.replace(' ', '_')}({tag},{index})"
        for word, tag, _, index in rows
        if tag not in _EXCLUDE_TAG_SET
        and word not in _STOPWORD_SET
        and 1 < len(word) < 50
    ]

# Filter every tagged document down to "word(tag,index)" tokens; the apply is
# routed through the swifter accessor registered by `import swifter`.
result['Content_filtered'] = result['Content_tagged'].swifter.apply(tag_filter)

# 7. Multiple Word Expressions (MWE)
# MWE_list.txt holds one expression per line; apply_MWE splits each entry on
# '-' to get its component words.
# NOTE(review): the file is opened with the platform's default encoding —
# confirm it is plain ASCII, or that the default matches how it was saved.
with open("MWE_list.txt", "r") as fd:
    MWE_list = fd.read().splitlines()

@functools.lru_cache(maxsize=8)
def _compile_mwe_patterns(mwe_entries):
    """Return ``(compiled_pattern, replacement)`` pairs for a tuple of MWEs.

    Cached so that sorting and regex compilation happen once per distinct MWE
    list rather than once per document.
    """
    # A filtered token looks like ``word(TAG,index)``.
    # NOTE(review): ``[A-Z]+`` does not match tags containing other characters
    # (e.g. an underscore) — confirm that is intended.
    token_suffix = r"\([A-Z]+,[0-9]+\)"
    compiled = []
    # Expressions with the most components are substituted first so a shorter
    # MWE cannot consume part of a longer one (the sort is stable).
    for mwe in sorted(mwe_entries, key=lambda entry: entry.count('-'), reverse=True):
        components = mwe.split('-')
        if not all(components):
            # A blank line (or a stray leading/trailing/double hyphen) would
            # yield a pattern matching the bare "(TAG,index)" suffix of
            # arbitrary tokens, so such entries are ignored.
            continue
        body = ' '.join(re.escape(component) + token_suffix for component in components)
        # The lookarounds pin the match to whole tokens, so "new-york" cannot
        # match inside "renew(VB,1) york(NNP,2)".
        pattern = re.compile(r"(?<!\S)" + body + r"(?!\S)")
        compiled.append((pattern, f"{mwe}(MWE)"))
    return tuple(compiled)


def apply_MWE(text_list, mwe_list=None):
    """Collapse known multi-word expressions in a filtered token list.

    Consecutive ``word(TAG,index)`` tokens whose words spell out an MWE (its
    components are separated by '-') are replaced by one ``"<mwe>(MWE)"``
    token. MWE components are matched literally and only against whole tokens.

    Args:
        text_list: list of ``"word(TAG,index)"`` strings for one document.
        mwe_list: optional iterable of MWE strings; defaults to the
            module-level ``MWE_list``.

    Returns:
        The token list with MWEs merged. An empty input yields an empty list.
    """
    # len() rather than truthiness so array-like inputs are handled too.
    if len(text_list) == 0:
        return []
    entries = MWE_list if mwe_list is None else mwe_list
    text_joined = ' '.join(text_list)
    for pattern, replacement in _compile_mwe_patterns(tuple(entries)):
        # A callable replacement inserts the MWE verbatim (no backslash or
        # group-reference processing).
        text_joined = pattern.sub(lambda _match, repl=replacement: repl, text_joined)
    return text_joined.split(' ')

# Merge multi-word expressions in every filtered document.
result['Content_final'] = result['Content_filtered'].swifter.apply(apply_MWE)

# 8. Final Cleanup and Save
# Drop the intermediate columns so only the original columns plus
# 'Content_WithIndex' and 'Content_final' are written out.
del result["Content_tagged"]
del result["Content_filtered"]
result.to_feather('result_Stanza_ReadyToR.feather')

